library(CrossClustering)
library(cluster)
rm(list=ls())

# data load

load("Brain2002.RData")
dati <- gene_Brain2002

table(group_Brain2002$Group) # 42 samples / 5 clusters

gene_Brain2002 <- scale(log2(gene_Brain2002)) # log2 transformed and scaled

# true partition
true_part <- numeric()
true_part[group_Brain2002$Group=="Brain_MD"]=1
true_part[group_Brain2002$Group=="Brain_MGlio"]=2
true_part[group_Brain2002$Group=="Brain_Ncer"]=3
true_part[group_Brain2002$Group=="Brain_PNET"]=4
true_part[group_Brain2002$Group=="Brain_Rhab"]=5

###### CrossClustering

d_euc <- dist(t(gene_Brain2002), method="euclidean")

CC_euc <- cc_crossclustering(
                          dist=d_euc,
                          k_w_min = 2,
                          k_w_max = 19,
                          k2_max = 20,
                          out = TRUE,
                          method = "complete"
                        )

CrossClustering::ari(table(
  CrossClustering::cc_get_cluster(CC_euc),
  true_part), 
  alpha = 0.05, digits = 2) # ARI=0.64

# comparison with Ward & Complete-linkage
clusto <- function(x, distMat){
  cutree(hclust(distMat, method="ward.D"), k=x)
}

clucomp <- function(x, distMat){
  cutree(hclust(distMat, method="complete"), k=x)
}

members_euc.ward <- sapply(2:20, clusto, distMat=d_euc)
members_euc.comp <- sapply(2:20, clucomp, distMat=d_euc)

Sil_euc.ward <-   numeric(19)
Sil_euc.complete <-   numeric(19)

for (c in 1:ncol(members_euc.ward)) {
  Sil_euc.ward[c] <- summary(silhouette(x=members_euc.ward[,c], 
                                        dist=d_euc))$avg.width
  Sil_euc.complete[c] <- summary(silhouette(x=members_euc.comp[,c], 
                                            dist=d_euc))$avg.width
}

names(Sil_euc.ward) <- names(Sil_euc.complete) <- paste("Clus", 2:20, sep="")

table(members_euc.ward[,which.max(Sil_euc.ward)]) # 2 clusters, Silhouette=0.19474538
table(members_euc.comp[,which.max(Sil_euc.complete)]) # 2 clusters, Silhouette=0.19383346

groupsGE_ward_euc <- members_euc.ward[,which.max(Sil_euc.ward)]
groupsGE_comp_euc <- members_euc.comp[,which.max(Sil_euc.complete)]

CrossClustering::ari(table(
  groupsGE_ward_euc,
  true_part), 
  alpha = 0.05, digits = 2) # ARI=0.14

CrossClustering::ari(table(
  groupsGE_comp_euc,
  true_part), 
  alpha = 0.05, digits = 2) # ARI=0.18

# dendograms

source("http://addictedtor.free.fr/packages/A2R/lastVersion/R/code.R")

Subtypes=group_Brain2002$Group
Subtypes <- as.character(Subtypes)
Subtypes[Subtypes=="Brain_MD"] <- "MD"
Subtypes[Subtypes=="Brain_MGlio"] <- "MGlio"
Subtypes[Subtypes=="Brain_Ncer"] <- "Ncer"
Subtypes[Subtypes=="Brain_PNET"] <- "PNET"
Subtypes[Subtypes=="Brain_Rhab"] <- "Rhab"

my.clu_ward <- hclust(d_euc, method="ward.D")
my.clu_comp <- hclust(d_euc, method="complete")

# Figure S121
A2Rplot(my.clu_ward,
        k=2,
        fact.sup=Subtypes,
        boxes = FALSE,
        col.up = "gray", #criteria=true_part,
        col.down = 1:2, show.labels=FALSE, main="Ward")

# Figure S122
A2Rplot(my.clu_comp,
        k=2,
        fact.sup=Subtypes,
        boxes = FALSE,
        col.up = "gray",
        col.down = 1:2, show.labels=FALSE, main="Complete")

### K-means ###

## K-means choosing the number of clusters with the Silhouette.
n  <- 42 #samples
km1 <- km2 <- km3 <- km4 <- km5 <- km6 <- km7 <- km8 <- km9 <- km10 <- matrix(NA, ncol=19, nrow=n)
SilKm1 <- SilKm2 <- SilKm3 <- SilKm4 <- SilKm5 <- SilKm6 <- SilKm7 <- SilKm8 <- SilKm9 <- SilKm10 <- numeric(19)

d=d_euc
data=t(gene_Brain2002)

for (k in 2:20){
  km1[,k-1] <- kmeans(data, c=k)$cluster
  SilKm1[k-1] <- mean(silhouette(as.numeric(km1[,k-1]), dist=d)[,3])
  
  km2[,k-1] <- kmeans(data, c=k)$cluster
  SilKm2[k-1] <- mean(silhouette(as.numeric(km2[,k-1]), dist=d)[,3])
  
  km3[,k-1] <- kmeans(data, c=k)$cluster
  SilKm3[k-1] <- mean(silhouette(as.numeric(km3[,k-1]), dist=d)[,3])
  
  km4[,k-1] <- kmeans(data, c=k)$cluster
  SilKm4[k-1] <- mean(silhouette(as.numeric(km4[,k-1]), dist=d)[,3])
  
  km5[,k-1] <- kmeans(data, c=k)$cluster
  SilKm5[k-1] <- mean(silhouette(as.numeric(km5[,k-1]), dist=d)[,3])
  
  km6[,k-1] <- kmeans(data, c=k)$cluster
  SilKm6[k-1] <- mean(silhouette(as.numeric(km6[,k-1]), dist=d)[,3])
  
  km7[,k-1] <- kmeans(data, c=k)$cluster
  SilKm7[k-1] <- mean(silhouette(as.numeric(km7[,k-1]), dist=d)[,3])
  
  km8[,k-1] <- kmeans(data, c=k)$cluster
  SilKm8[k-1] <- mean(silhouette(as.numeric(km8[,k-1]), dist=d)[,3])
  
  km9[,k-1] <- kmeans(data, c=k)$cluster
  SilKm9[k-1] <- mean(silhouette(as.numeric(km9[,k-1]), dist=d)[,3])
  
  km10[,k-1] <- kmeans(data, c=k)$cluster
  SilKm10[k-1] <- mean(silhouette(as.numeric(km10[,k-1]), dist=d)[,3])
}  

lista_km <- list(km1, km2, km3, km4, km5, km6, km7, km8, km9, km10)
lista_Silkm <- list(SilKm1, SilKm2, SilKm3, SilKm4, SilKm5, SilKm6, 
                    SilKm7, SilKm8, SilKm9, SilKm10)

max(sapply(lista_Silkm, max)) # max Silhouette=0.1700082

RandKm <- CrossClustering::ari(table(lista_km[[which.max(sapply(lista_Silkm, max))]][,sapply(lista_Silkm, which.max)[which.max(sapply(lista_Silkm, max))]],
                            as.integer(group_Brain2002$Gr))) # ARI=0.14

## SOM 

library(kohonen)

griglia <- matrix(NA, ncol=3, nrow=70)
griglia[,1] <- c(rep(1, 19), rep(2, 10), rep(3, 7), rep(4, 5), rep(5, 4), rep(6, 4), rep(7, 3), rep(8, 3), rep(9, 3), rep(10, 2), 11:20)
griglia[,2] <- c(2:20, 1:10, 1:7, 1:5, 1:4, 1:4, 1:3, 1:3, 1:3, 1:2, rep(1, 10))
griglia[,3]  <- griglia[,1]*griglia[,2]

my.som <- matrix(NA, nrow=n, ncol=nrow(griglia))

for (c in 1:nrow(griglia)){
  my.som[,c] <- som(data, grid=somgrid(xdim=griglia[c,1], ydim=griglia[c,2], 
                                       "rectangular"))$unit.classif
}

SilSom <- numeric(ncol(my.som))
for (c in 1:ncol(my.som)){
  SilSom[c] <- mean(silhouette(as.numeric(my.som[,c]), dist=d)[,3])
}

max(SilSom) #  max Silhoutte=0.1731547
table(my.som[,which.max(SilSom)]) # 2 clusters

som <- matrix(NA, nrow=n, ncol=10)
colnames(som) <- paste("Prova", 1:10, sep="")
for (c in 1:10){
  som[,c] <- som(data, grid=somgrid(xdim=1, ydim=2, "rectangular"))$unit.classif
}

sil_som <- numeric(10)
for (c in 1:10){
  sil_som[c] <- mean(silhouette(as.numeric(som[,c]), dist=d)[,3])
}

max(sil_som) # max Silhoutte=0.2294951

CrossClustering::ari(table(som[,which.max(sil_som)], as.integer(group_Brain2002$Gr))) # ARI=0

# DBSCAN

library(RANN)
library(fpc)

nearest4 <- nn2(data=t(dati), k=4)
plot(nearest4[[2]][order(nearest4[[2]][,4], decreasing=T),4], type="l", axes=F, 
     ylab="3-dist measure", xlab="points")
axis(2)
axis(1, at=seq(1, 91, by=1))
abline(v=6, col="red")
nearest4[[2]][order(nearest4[[2]][,4], decreasing=T),4][6]

nearest5 <- nn2(data=t(dati), k=5)
plot(nearest5[[2]][order(nearest5[[2]][,5], decreasing=T),5], type="l", axes=F, 
     ylab="4-dist measure", xlab="points")
axis(2)
axis(1, at=seq(1, 91, by=1))
abline(v=4, col="red")
nearest5[[2]][order(nearest5[[2]][,5], decreasing=T),5][4]

nearest6 <- nn2(data=t(dati), k=6)
plot(nearest6[[2]][order(nearest6[[2]][,6], decreasing=T),6], type="l", axes=F, 
     ylab="5-dist measure", xlab="points")
axis(2)
axis(1, at=seq(1, 91, by=1))
abline(v=5, col="red")
nearest6[[2]][order(nearest6[[2]][,6], decreasing=T),6][5]

nearest7 <- nn2(data=t(dati), k=7)
plot(nearest7[[2]][order(nearest7[[2]][,7], decreasing=T),7], type="l", axes=F, 
     ylab="6-dist measure", xlab="points")
axis(2)
axis(1, at=seq(1, 91, by=1))
abline(v=4, col="red")
nearest7[[2]][order(nearest7[[2]][,7], decreasing=T),7][4]

nearest8 <- nn2(data=t(dati), k=8)
plot(nearest8[[2]][order(nearest8[[2]][,8], decreasing=T),8], type="l", axes=F, 
     ylab="7-dist measure", xlab="points")
axis(2)
axis(1, at=seq(1, 91, by=1))
abline(v=4, col="red")
nearest8[[2]][order(nearest8[[2]][,8], decreasing=T),8][4]

db4=dbscan(t(dati), eps=nearest5[[2]][order(nearest5[[2]][,5], decreasing=T),5][6], MinPts=4)$cluster
db5=dbscan(t(dati), eps=nearest6[[2]][order(nearest6[[2]][,6], decreasing=T),6][4], MinPts=5)$cluster
db6=dbscan(t(dati), eps=nearest7[[2]][order(nearest7[[2]][,7], decreasing=T),7][5], MinPts=6)$cluster
db7=dbscan(t(dati), eps=nearest8[[2]][order(nearest8[[2]][,8], decreasing=T),8][4], MinPts=7)$cluster

#### Heatmap ###

# relabeling for graphical purposes
cc=CrossClustering::cc_get_cluster(CC_euc)[order(true_part)]

somme=som[,which.max(sil_som)][order(true_part)]
somme[somme==1]<-3

ward=members_euc.ward[,which.max(Sil_euc.ward)][order(vera_part)]
ward[ward==1]<-3
comp<-members_euc.comp[,which.max(Sil_euc.complete)][order(vera_part)]
comp[comp==1]<-3

km<-lista_km[[which.max(sapply(lista_Silkm, max))]][,sapply(lista_Silkm, which.max)[which.max(sapply(lista_Silkm, max))]][order(vera_part)]
km[km==1]<-3

true<-true_part[order(true_part)]
true[true==1]<-6

db<-db4+1

hm <- matrix(NA, nrow=n, ncol=7)
colnames(hm) <- c("Ward", "CL", "K-means", "SOM", "DBSCAN", "CC", "Membership")
rownames(hm) <- paste("Elem", 1:n, sep="")
hm[,1] <- ward
hm[,2] <- comp
hm[,3] <- km
hm[,4] <- somme
hm[,5] <- db
hm[,6] <- cc
hm[,7] <- true

library(fields)
my_palette <- colorRampPalette(c("white", "yellow", "red", "springgreen3", "violet", "steelblue1", "steelblue4"))(n = 10)
par(mar=c(5,4.5,4,7))
image(hm, col=my_palette, axes=F, xlab="Samples")
mtext(text=colnames(hm), side=2, line=0.3, at=c(.0,.16,.35,.5,.65,.85,1), las=1, cex=0.65)
image.plot(hm, col=my_palette, legend.only=T, 
           legend.lab="Cluster Membership", 
           #          breaks=brks, legend.shrink=0.8,
           lab.breaks=c("Outliers", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"))
title("Brain tumors data")
